
library(stringr)
library(dplyr)
library(tidyr)
library(zoo)

## note that this does not run nicely on a typical laptop, I ran it on hyak so it wouldn't run out of memory
## sets some variables -- add yours and comment mine out when you run!
#dataPath = './data/'
rawDataPath = '/gscratch/comdata/raw_data/python/'
processedDataPath = './data/'
paperDataPath = './knitr_rdata/'
source('lib-00-utils.R') #helper functions
###paperPath = '/home/kaylea/Dropbox/Apps/Overleaf/'
### =================\/ or whatever yours is
#paperPath = '/home/mkross/Dropbox/Apps/Overleaf/'

## loads up data from the data dir


pkgDF <- read.csv(paste0(rawDataPath, 'pymod_jiba.csv'))

aiDF <- read.csv(paste0(rawDataPath, 'bigquery_aiplus_matches.csv'))

vDF.dirty <- read.csv(paste0(rawDataPath, 'cve_mentions_pypi.csv'))

vDF <- data.frame(CVE=vDF.dirty$Name, bigUglyReferencesString=vDF.dirty$References, vulDateUgly=vDF.dirty$Phase)

#### here's how I got the text out:
test <- "blah blah https://pypi.org/project/test blah blah"
matchymatch <- '[^\\s]*(pypi\\.org|pypi\\.python\\.org)([^\\s]*)'
step1 <- str_extract_all(test, matchymatch)[[1]]
step1
step2 <- str_replace(step1, "https://pypi.org/project/", "")
step2
vDF$bigUglyReferencesString[1]
t <- str_extract(vDF$bigUglyReferencesString[1], matchymatch)
t ## it works!

vDF$pypiURL <- str_extract(vDF$bigUglyReferencesString, matchymatch) ### not all; just need one of them
head(vDF$pypiURL)

badPatterns <- 'CONFIRM:|MISC:|https://|http://|pypi.python.org/pypi/|pypi.org/project/|packaging.python.org' # add new items to remove here
## regex match to pull out just the pypi url
vDF$package <- str_replace_all(vDF$pypiURL, pattern=badPatterns, replacement="")
head(vDF$package)
vDF$package <- sapply(vDF$package, function(x) strsplit(x,'/')[[1]][[1]])
head(vDF$package)
vDF$package <- str_replace_all(vDF$package,'Products.','')
head(vDF$package) 
vDF$package <- str_replace_all(vDF$package,'Hotfix\\d.*','') #hotfix followed by a number
head(vDF$package) 
vDF$package <- str_replace_all(vDF$package,'\\.py','') # we don't want .py
head(vDF$package) ## some of these won't match because pypi changed its url representation
table(vDF$package) ## check if all of these are valid pypi packages, go back and fix any that aren't

vDF.bad <- subset(vDF, is.na(vDF$package)) ## fails go here; investigate manually and fix if needed
length(vDF.bad$CVE)
vDF <- subset(vDF, !is.na(vDF$package))

## that concludes the package cleaning. now we do the date cleaning (it's easier thank goodness)

vDF$vulDate <- str_replace_all(vDF$vulDateUgly, "Assigned \\(", "") 
head(vDF$vulDate)
vDF$vulDate <- str_replace_all(vDF$vulDate, "\\)", "") 
vDF$vulDate <- as.Date(vDF$vulDate, format="%Y%m%d")
vDF.full <- vDF
# drop cols we don't need
vDF$bigUglyReferencesString <- vDF$vulDateUgly <- vDF$pypiURL <- vDF$CVE <- NULL
vDF <- unique(vDF)
vDF$package <- tolower(vDF$package)
head(vDF) #yay!

### now we have 3 clean datasets but we need to merge them and turn them from wide to long

## easiest step first: if a package exists in aiDF, we code it as AI-related

aiDF$is_AI <- TRUE
aiDF$name <- tolower(aiDF$name)
pkgDF$Module.name <- tolower(pkgDF$Module.name) #normalize to all lower cases
fullDF <- merge(pkgDF, aiDF, by.x='Module.name', by.y='name', all.x=TRUE)
fullDF$is_AI[is.na(fullDF$is_AI)] <- 0 ## NA means no match means is_AI is 0
table(fullDF$is_AI)
fullDF$package <- fullDF$Module.name ## harmonize naming
fullDF$Module.name <- NULL ## harmonize naming
fullDF$package <- tolower(fullDF$package)

fullDF.orig <- fullDF

fullDF.long <- fullDF %>% 
  pivot_longer(
    cols = `X2016_05`:`X2023_12`, 
    names_to = "month",
    values_to = "downloads"
  )

fullDF.long$month <- str_replace_all(fullDF.long$month, "X", "") 
head(fullDF.long)
fullDF.long$month.clean <- as.Date(paste(fullDF.long$month, "1"), format="%Y_%m%d") #fix silly date format
head(fullDF.long)

### next question is vulnerabilities.
###
### let's get rid of any vulnerabilities where we don't have a package match

dim(vDF) #make sure row count doesn't increase!
vDF <- merge(vDF, fullDF, by='package', all.x=TRUE)
dim(vDF) #make sure row count doesn't increase!

vDF.nomatch <- subset(vDF, is.na(vDF$is_AI)) ## missings here ### TODO try to fix
vDF <- subset(vDF, !is.na(vDF$is_AI)) ## no missing values allowed

### most packages never have one in the CVE database, for lots of reasons, like functionality, popularity, etc. 
### so we're not going to try to compare those that never enter the DB to those that do
### being the
### most packages have only 1:
### but a few do have multiple
vulnTab <- as.data.frame(table(vDF$package))
#hist(multivuln) 
n.multivuln <- length(subset(vulnTab, vulnTab$Freq > 1)$Var1)
### I think what we want to do is look only at "first" vulnerability, 
### and for those that have multiple, we want to stop keeping their data starting with the month
### of the second vulnerability -- I don't think we have enough data to tease out the impact of subsequent
### vulnerabilities or otherwise distinguish one from the other
###
### my idea on how to do this:
### give each vulnerability a count
### work through the column: when we hit a vuln_count > 1, set downloads=NA that month and all subsequent months
###

fixDF <- vDF %>% group_by(package) %>% reframe(
  package=package,
  vulDate=vulDate,
  nth=rank(vulDate)
)
fixDF <- subset(fixDF, fixDF$nth == 2)
head(fixDF)
vDF$Total <- NULL #don't need

### pivot wide to long

long <- vDF %>% 
  pivot_longer(
    cols = `X2016_05`:`X2023_12`, 
    names_to = "month",
    values_to = "downloads"
  )

long$month <- str_replace_all(long$month, "X", "") 
head(long)
long$month.clean <- as.Date(paste(long$month, "1"), format="%Y_%m%d") #fix silly date format
head(long)

long.save <- long
#long <- subset(long, long$nth == 1) ## drop the row proliferation this caused

dim(long)
### I'm sure there's a cleaner way but this does work
for (p in fixDF$package) {
  remove_after_date <- subset(fixDF, fixDF$package==p)$vulDate 
  remove_after_date <- lubridate::floor_date(remove_after_date, 'month') ## set to first day of the month
  print(paste("checking", p, 'and removing rows after', remove_after_date))
  toClean <- subset(long, long$package==p)
  print(length(toClean$package))
  remains <- subset(long, long$package!=p)
  clean <- subset(toClean, toClean$month.clean < remove_after_date) 
  print(length(clean$package))
  long <- rbind(clean, remains)
}

dim(long)
dim(long.save)

##remove a few cols we don't need and fix names
long$month <- long$month.clean
long$month.clean <- NULL
dim(long)
vDF.orig <- vDF
vDF <- long

fullDF <- fullDF.long
fullDF.long <- NULL
fullDF$month <- fullDF$month.clean
fullDF$month.clean <- NULL
fullDF$Total <- NULL

## no duplicates allowed!
vDF <- unique(vDF)
fullDF <- unique(fullDF) ## this is the step that costs a lot of memory; hard to run on laptop



### add a var to handle post-discontinuity

### set birthday 
birthDF <- fullDF %>% group_by(package) %>% reframe(
	package = package,
	birthday = fullDF[which.max(downloads == min(downloads[downloads > 0])),]$month ## first non-zero month
)

birthDF <- unique(birthDF) 

### set months-old
fullDF <- merge(fullDF, birthDF, by='package')
fullDF$monthOfLife <- 12*(as.yearmon(fullDF$month) - as.yearmon(fullDF$birthday))
## set months-since-gpt
fullDF$monthsSinceGPT <- 12*(as.yearmon(fullDF$month) - as.yearmon('2022-11-30'))

## set months-since-vulnerability
vDF <- merge(vDF, birthDF, by='package')
vDF$monthOfLife <- 12*(as.yearmon(vDF$month) - as.yearmon(vDF$birthday))
vDF$monthsSinceVuln <- 12*(as.yearmon(vDF$month) - as.yearmon(vDF$vulDate))

### 

vDF <- subset(vDF, vDF$monthOfLife >= 0)
fullDF <- subset(fullDF, fullDF$monthOfLife >= 0)

fullDF <- unique(fullDF)
vDF <- unique(vDF)

###
n.vulnPK <- length(vDF$package) ## number of vulnerabilities
n.overallPK <- length(unique(fullDF$package))
n.obs <- length(fullDF$package)
n.ai <- length(aiDF$name)
### some fun facts
max.dl.row <- fullDF[which.max(fullDF$downloads),]


## save out just quantities of interest 
if (!nosave) {
	r <- list()
	remember(n.multivuln)
	remember(n.vulnPK)
	remember(n.ai)
	remember(n.overallPK)
	remember(n.obs)
	remember(max.dl.row)
	save(r, file=paste0(paperDataPath, "descriptives.RData"), version=2)
	print("Remembrances complete")
}

save(vDF,fullDF,file=paste0(processedDataPath,'readyToModel.RData'))
